/*
** $Id: llex.c $
** Lexical Analyzer
** See Copyright Notice in lua.h
*/

#define llex_c
#define LUA_CORE

#include "lprefix.h"

#include <locale.h>
#include <string.h>

#include "lua.h"

#include "lctype.h"
#include "ldebug.h"
#include "ldo.h"
#include "lgc.h"
#include "llex.h"
#include "lobject.h"
#include "lparser.h"
#include "lstate.h"
#include "lstring.h"
#include "ltable.h"
#include "lzio.h"

/*
** Reads the next character from the input stream 'ls->z' into
** 'ls->current'; the expression's value is the character just read.
** The parameter is parenthesized so that any pointer expression (not
** only a plain identifier) can be passed safely.
*/
#define next(ls)          ((ls)->current = zgetc((ls)->z))

/* True when the current character is a line-break character ('\n' or '\r') */
#define currIsNewline(ls) ((ls)->current == '\n' || (ls)->current == '\r')

/*
** Textual form of every token whose code is >= FIRST_RESERVED, indexed
** by (token - FIRST_RESERVED) (see 'luaX_token2str'). The first
** NUM_RESERVED entries are the reserved words that 'luaX_init' turns
** into fixed strings; the remaining ones are multi-char operators and
** the placeholder names used for end of stream, numerals, names, and
** strings.
** NOTE(review): the entry order presumably has to match the token
** enumeration declared in llex.h -- confirm before reordering.
*/
/* ORDER RESERVED */
static const char *const luaX_tokens[] = {
    "and", "break", "do", "else", "elseif",
    "end", "false", "for", "function", "goto", "if",
    "in", "local", "nil", "not", "or", "repeat",
    "return", "then", "true", "until", "while",
    "//", "..", "...", "==", ">=", "<=", "~=",
    "<<", ">>", "::", "<eof>",
    "<number>", "<integer>", "<name>", "<string>"
};

/*
** Appends the current character to the token buffer and then advances
** the input; the expression's value is the newly read character.
** The parameter is parenthesized so that any pointer expression can be
** passed safely.
*/
#define save_and_next(ls) (save((ls), (ls)->current), next(ls))

static l_noret lexerror(LexState *ls, const char *msg, int token);

/*
** Appends character 'c' to the token buffer of 'ls'. When the buffer is
** full its size is doubled first; if doubling is not possible because
** the current size already reached MAX_SIZE / 2, a "lexical element too
** long" error is raised instead.
*/
static void save(LexState *ls, int c)
{
    Mbuffer *buf = ls->buff;
    if (luaZ_bufflen(buf) + 1 > luaZ_sizebuffer(buf)) { /* no room left? */
        size_t cap = luaZ_sizebuffer(buf);
        if (cap >= MAX_SIZE / 2) /* cannot double it? */
            lexerror(ls, "lexical element too long", 0);
        cap *= 2;
        luaZ_resizebuffer(ls->L, buf, cap);
    }
    buf->buffer[luaZ_bufflen(buf)++] = cast_char(c);
}

/*
** Creates the environment-name string and one string per reserved word,
** fixing all of them so they are never collected. Each reserved word
** gets its 1-based position in 'luaX_tokens' stored in its 'extra'
** field, which marks it as reserved.
*/
void luaX_init(lua_State *L)
{
    int rank;
    TString *envname = luaS_newliteral(L, LUA_ENV);
    luaC_fix(L, obj2gco(envname)); /* keep the env name alive forever */
    for (rank = 1; rank <= NUM_RESERVED; rank++) {
        TString *word = luaS_new(L, luaX_tokens[rank - 1]);
        luaC_fix(L, obj2gco(word));     /* keep the reserved word alive */
        word->extra = cast_byte(rank);  /* tag it as a reserved word */
    }
}

/*
** Returns a printable description of 'token'.
** Single-byte tokens are shown quoted as the character itself, or as
** '<\N>' (N being the decimal code) when not printable. Reserved words
** and fixed symbols are shown quoted. Tokens from TK_EOS on return their
** table entry unquoted.
*/
const char *luaX_token2str(LexState *ls, int token)
{
    const char *text;
    if (token < FIRST_RESERVED) { /* single-byte symbol */
        if (!lisprint(token)) /* control character */
            return luaO_pushfstring(ls->L, "'<\\%d>'", token);
        return luaO_pushfstring(ls->L, "'%c'", token);
    }
    text = luaX_tokens[token - FIRST_RESERVED];
    if (token >= TK_EOS) /* names, strings, and numerals */
        return text;
    /* fixed format (symbols and reserved words) */
    return luaO_pushfstring(ls->L, "'%s'", text);
}

/*
** Returns the text to show for 'token' in an error message. For names,
** strings, and numerals it is the quoted content of the token buffer
** (which gets a terminating '\0' appended); any other token is
** described by 'luaX_token2str'.
*/
static const char *txtToken(LexState *ls, int token)
{
    if (token == TK_NAME || token == TK_STRING ||
        token == TK_FLT || token == TK_INT) {
        save(ls, '\0'); /* make the buffer a proper C string */
        return luaO_pushfstring(ls->L, "'%s'", luaZ_buffer(ls->buff));
    }
    return luaX_token2str(ls, token);
}

/*
** Raises a syntax error. The message is first passed through
** 'luaG_addinfo' together with the source name and the current line
** number; when 'token' is non-zero, a "near <token text>" suffix is
** added. Never returns: control leaves through 'luaD_throw' with
** LUA_ERRSYNTAX.
*/
static l_noret lexerror(LexState *ls, const char *msg, int token)
{
    const char *located = luaG_addinfo(ls->L, msg, ls->source, ls->linenumber);
    if (token != 0) {
        const char *shown = txtToken(ls, token);
        luaO_pushfstring(ls->L, "%s near %s", located, shown);
    }
    luaD_throw(ls->L, LUA_ERRSYNTAX);
}

/*
** Raises a syntax error with message 'msg', reported near the current
** token of 'ls'. Never returns.
*/
l_noret luaX_syntaxerror(LexState *ls, const char *msg)
{
    int current = ls->t.token;
    lexerror(ls, msg, current);
}

/*
** Creates a new string and anchors it in scanner's table so that it
** will not be collected until the end of the compilation; by that time
** it should be anchored somewhere. It also internalizes long strings,
** ensuring there is only one copy of each unique string.  The table
** here is used as a set: the string enters as the key, while its value
** is irrelevant. We use the string itself as the value only because it
** is a TValue readily available. Later, the code generation can change
** this value.
*/
TString *luaX_newstring(LexState *ls, const char *str, size_t l)
{
    lua_State *L = ls->L;
    TString *ts = luaS_newlstr(L, str, l); /* build the candidate string */
    const TValue *slot = luaH_getstr(ls->h, ts);
    if (ttisnil(slot)) { /* first time this string is seen? */
        TValue *anchor = s2v(L->top++); /* take one stack slot */
        setsvalue(L, anchor, ts);       /* keep the string reachable meanwhile */
        luaH_finishset(L, ls->h, anchor, slot, anchor); /* t[string] = string */
        /* table is not a metatable, so no cache needs to be invalidated */
        luaC_checkGC(L);
        L->top--; /* release the stack slot */
        return ts;
    }
    /* already in the table: hand back the copy stored as its key */
    return keystrval(nodefromval(slot));
}

/*
** Skips one newline sequence (any of \n, \r, \n\r, or \r\n) and bumps
** the line counter, raising an error when the counter reaches MAX_INT.
*/
static void inclinenumber(LexState *ls)
{
    int first = ls->current;
    lua_assert(currIsNewline(ls));
    next(ls); /* consume the first '\n' or '\r' */
    if (ls->current != first && currIsNewline(ls))
        next(ls); /* consume the second half of '\n\r' or '\r\n' */
    ls->linenumber++;
    if (ls->linenumber >= MAX_INT)
        lexerror(ls, "chunk has too many lines", 0);
}

/*
** Prepares 'ls' to scan the stream 'z', whose first character
** ('firstchar') has already been read by the caller. Line counters
** start at 1, there is no current token and no look-ahead token, and
** the token buffer is sized to LUA_MINBUFFER.
*/
void luaX_setinput(lua_State *L, LexState *ls, ZIO *z, TString *source,
                   int firstchar)
{
    /* where the input comes from */
    ls->L = L;
    ls->z = z;
    ls->source = source;
    ls->current = firstchar;
    /* scanning state */
    ls->fs = NULL;
    ls->t.token = 0;
    ls->lookahead.token = TK_EOS; /* no look-ahead token */
    ls->linenumber = 1;
    ls->lastline = 1;
    /* resources */
    ls->envn = luaS_newliteral(L, LUA_ENV);       /* get env name */
    luaZ_resizebuffer(L, ls->buff, LUA_MINBUFFER); /* initialize buffer */
}

/*
** =======================================================
** LEXICAL ANALYZER
** =======================================================
*/

/*
** If the current char is 'c', skips it (without saving) and returns 1;
** otherwise leaves the input untouched and returns 0.
*/
static int check_next1(LexState *ls, int c)
{
    if (ls->current != c)
        return 0;
    next(ls);
    return 1;
}

/*
** If the current char is one of the two chars in 'set', saves it in the
** buffer, advances, and returns 1; otherwise returns 0 and leaves the
** input untouched.
*/
static int check_next2(LexState *ls, const char *set)
{
    lua_assert(set[2] == '\0');
    if (ls->current != set[0] && ls->current != set[1])
        return 0;
    save_and_next(ls);
    return 1;
}

/* LUA_NUMBER */
/*
** Reads a numeral into the buffer and converts it. The scanner itself
** is deliberately permissive, because 'luaO_str2num' rejects anything
** ill formed afterwards. Roughly, the accepted pattern is:
**
**   %d(%x|%.|([Ee][+-]?))* | 0[Xx](%x|%.|([Pp][+-]?))*
**
** A sign ([+-]) is consumed only right after a valid exponent mark, so
** that '3-4' or '0xe+1' are not read as a single number.
**
** The caller might have already read an initial dot.
**
** Returns TK_INT (value in seminfo->i) or TK_FLT (value in seminfo->r);
** raises "malformed number" when the conversion fails.
*/
static int read_numeral(LexState *ls, SemInfo *seminfo)
{
    TValue num;
    const char *expmarks = "Ee";
    int leading = ls->current;
    lua_assert(lisdigit(ls->current));
    save_and_next(ls);
    if (leading == '0' && check_next2(ls, "xX")) /* hexadecimal? */
        expmarks = "Pp";
    for (;;) {
        if (check_next2(ls, expmarks)) { /* exponent mark? */
            check_next2(ls, "-+");       /* optional exponent sign */
            continue;
        }
        if (!lisxdigit(ls->current) && ls->current != '.')
            break; /* neither '%x' nor '%.' */
        save_and_next(ls);
    }
    if (lislalpha(ls->current)) /* is numeral touching a letter? */
        save_and_next(ls);      /* force an error */
    save(ls, '\0');
    if (luaO_str2num(luaZ_buffer(ls->buff), &num) == 0) /* format error? */
        lexerror(ls, "malformed number", TK_FLT);
    if (!ttisinteger(&num)) {
        lua_assert(ttisfloat(&num));
        seminfo->r = fltvalue(&num);
        return TK_FLT;
    }
    seminfo->i = ivalue(&num);
    return TK_INT;
}

/*
** Reads a sequence '[=*[' or ']=*]', saving it in the buffer and
** stopping at (without consuming) the last bracket. Returns:
**   number of '='s + 2  for a well-formed sequence;
**   1                   for a single bracket (no '='s, no 2nd bracket);
**   0                   for an unfinished '[==...'.
*/
static size_t skip_sep(LexState *ls)
{
    size_t equals = 0;
    int bracket = ls->current;
    lua_assert(bracket == '[' || bracket == ']');
    save_and_next(ls);
    while (ls->current == '=') {
        equals++;
        save_and_next(ls);
    }
    if (ls->current == bracket)
        return equals + 2;
    if (equals == 0)
        return 1;
    return 0;
}

/*
** Reads the body of a long string or long comment whose opening
** delimiter (of level 'sep', as returned by 'skip_sep') has been read up
** to, but not including, its second '['. With a non-NULL 'seminfo' the
** content, stripped of both delimiters, is stored in seminfo->ts; with a
** NULL 'seminfo' (a comment) the text is discarded. Reaching the end of
** the input before the closing delimiter raises an error.
*/
static void read_long_string(LexState *ls, SemInfo *seminfo, size_t sep)
{
    int startline = ls->linenumber; /* initial line (for error message) */
    int closed = 0;                 /* set once the closing delimiter is read */
    save_and_next(ls);              /* skip 2nd '[' */
    if (currIsNewline(ls))          /* string starts with a newline? */
        inclinenumber(ls);          /* skip it */
    while (!closed) {
        int c = ls->current;
        if (c == EOZ) { /* error */
            const char *what = (seminfo ? "string" : "comment");
            const char *msg = luaO_pushfstring(ls->L,
                                               "unfinished long %s (starting at line %d)", what, startline);
            lexerror(ls, msg, TK_EOS);
        } else if (c == ']') {
            if (skip_sep(ls) == sep) {
                save_and_next(ls); /* skip 2nd ']' */
                closed = 1;
            }
        } else if (c == '\n' || c == '\r') {
            save(ls, '\n'); /* any newline sequence is stored as '\n' */
            inclinenumber(ls);
            if (!seminfo)
                luaZ_resetbuffer(ls->buff); /* avoid wasting space */
        } else if (seminfo) {
            save_and_next(ls);
        } else {
            next(ls);
        }
    }
    if (seminfo)
        seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + sep,
                                     luaZ_bufflen(ls->buff) - 2 * sep);
}

/*
** Checks a condition while reading an escape sequence: when 'c' is
** false, raises error 'msg' (near the string read so far). Unless the
** input is at its end, the offending character is added to the buffer
** first so that it shows up in the error message.
*/
static void esccheck(LexState *ls, int c, const char *msg)
{
    if (c)
        return; /* condition holds; nothing to report */
    if (ls->current != EOZ)
        save_and_next(ls); /* add current to buffer for error message */
    lexerror(ls, msg, TK_STRING);
}

/*
** Saves the current character, advances, and requires the new current
** character to be a hexadecimal digit (raising an error otherwise).
** Returns the value of that digit, which is left as the current char.
*/
static int gethexa(LexState *ls)
{
    int isdigit16;
    save_and_next(ls);
    isdigit16 = lisxdigit(ls->current);
    esccheck(ls, isdigit16, "hexadecimal digit expected");
    return luaO_hexavalue(ls->current);
}

/*
** Reads the two hexadecimal digits of a '\xXX' escape and returns the
** byte value they denote. The two characters saved while reading are
** removed from the buffer again.
*/
static int readhexaesc(LexState *ls)
{
    int high = gethexa(ls);
    int low = gethexa(ls);
    luaZ_buffremove(ls->buff, 2); /* remove saved chars from buffer */
    return (high << 4) + low;
}

/*
** Reads a '\u{XXX}' escape (current char is the 'u') and returns the
** value of its hexadecimal digits. At least one digit is required and
** the value must fit in 31 bits; violations raise an error. All chars
** saved while reading are removed from the buffer before returning.
*/
static unsigned long readutf8esc(LexState *ls)
{
    unsigned long code;
    int nsaved = 4;    /* chars to be removed: '\', 'u', '{', and first digit */
    save_and_next(ls); /* skip 'u' */
    esccheck(ls, ls->current == '{', "missing '{'");
    code = gethexa(ls); /* must have at least one digit */
    for (;;) {
        save_and_next(ls); /* save previous digit, look at next char */
        if (!lisxdigit(ls->current))
            break;
        nsaved++;
        esccheck(ls, code <= (0x7FFFFFFFu >> 4), "UTF-8 value too large");
        code = (code << 4) + luaO_hexavalue(ls->current);
    }
    esccheck(ls, ls->current == '}', "missing '}'");
    next(ls);                          /* skip '}' */
    luaZ_buffremove(ls->buff, nsaved); /* remove saved chars from buffer */
    return code;
}

/*
** Reads a '\u{XXX}' escape and appends the bytes produced for it by
** 'luaO_utf8esc' (the last 'len' bytes of the local array) to the
** token buffer.
*/
static void utf8esc(LexState *ls)
{
    char encoded[UTF8BUFFSZ];
    unsigned long code = readutf8esc(ls);
    int len = luaO_utf8esc(encoded, code);
    int pos;
    for (pos = UTF8BUFFSZ - len; pos < UTF8BUFFSZ; pos++)
        save(ls, encoded[pos]); /* add encoded byte to string */
}

/*
** Reads a decimal escape '\ddd' (one to three digits, as many as are
** present) and returns its value, raising an error when it exceeds
** UCHAR_MAX. The digits saved while reading are removed from the buffer.
*/
static int readdecesc(LexState *ls)
{
    int ndigits = 0;
    int value = 0;
    while (ndigits < 3 && lisdigit(ls->current)) { /* read up to 3 digits */
        value = 10 * value + ls->current - '0';
        save_and_next(ls);
        ndigits++;
    }
    esccheck(ls, value <= UCHAR_MAX, "decimal escape too large");
    luaZ_buffremove(ls->buff, ndigits); /* remove read digits from buffer */
    return value;
}

/*
** Reads a short literal string delimited by 'del' (the quote character
** that is the current char on entry) and stores its content, without
** the delimiters, in seminfo->ts.
**
** The opening delimiter and, while an escape is being decoded, its
** backslash are kept in the buffer so that error messages can show
** them; the backslash is removed once the escape has been translated.
** End of input or a raw line break inside the string raises
** "unfinished string".
**
** Escape handling ends in one of three labels:
**   read_save: skip the current char, then fall into 'only_save';
**   only_save: drop the saved backslash and save the translated char 'c';
**   no_save:   nothing else to do (buffer already in its final state).
*/
static void read_string(LexState *ls, int del, SemInfo *seminfo)
{
    save_and_next(ls); /* keep delimiter (for error messages) */
    while (ls->current != del) {
        switch (ls->current) {
            case EOZ:
                lexerror(ls, "unfinished string", TK_EOS);
                break; /* to avoid warnings */
            case '\n':
            case '\r':
                lexerror(ls, "unfinished string", TK_STRING);
                break;             /* to avoid warnings */
            case '\\': {           /* escape sequences */
                int c;             /* final character to be saved */
                save_and_next(ls); /* keep '\\' for error messages */
                switch (ls->current) {
                    case 'a':
                        c = '\a';
                        goto read_save;
                    case 'b':
                        c = '\b';
                        goto read_save;
                    case 'f':
                        c = '\f';
                        goto read_save;
                    case 'n':
                        c = '\n';
                        goto read_save;
                    case 'r':
                        c = '\r';
                        goto read_save;
                    case 't':
                        c = '\t';
                        goto read_save;
                    case 'v':
                        c = '\v';
                        goto read_save;
                    case 'x':
                        /* leaves the last hex digit as current char; 'read_save' skips it */
                        c = readhexaesc(ls);
                        goto read_save;
                    case 'u':
                        /* saves the encoded bytes itself */
                        utf8esc(ls);
                        goto no_save;
                    case '\n':
                    case '\r':
                        /* backslash followed by a line break: stored as '\n' */
                        inclinenumber(ls);
                        c = '\n';
                        goto only_save;
                    case '\\':
                    case '\"':
                    case '\'':
                        c = ls->current;
                        goto read_save;
                    case EOZ:
                        goto no_save;                 /* will raise an error next loop */
                    case 'z': {                       /* zap following span of spaces */
                        luaZ_buffremove(ls->buff, 1); /* remove '\\' */
                        next(ls);                     /* skip the 'z' */
                        while (lisspace(ls->current)) {
                            if (currIsNewline(ls))
                                inclinenumber(ls);
                            else
                                next(ls);
                        }
                        goto no_save;
                    }
                    default: {
                        esccheck(ls, lisdigit(ls->current), "invalid escape sequence");
                        c = readdecesc(ls); /* digital escape '\ddd' */
                        goto only_save;
                    }
                }
            read_save:
                next(ls);
                /* go through */
            only_save:
                luaZ_buffremove(ls->buff, 1); /* remove '\\' */
                save(ls, c);
                /* go through */
            no_save:
                break;
            }
            default:
                save_and_next(ls);
        }
    }
    save_and_next(ls); /* skip delimiter */
    /* buffer holds <del>content<del>: strip one char at each end */
    seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
                                 luaZ_bufflen(ls->buff) - 2);
}

/*
** Scans and returns the next token of the input.
**
** The token buffer is reset on entry. Line breaks, white space, and
** comments are skipped (the loop simply goes around again). Tokens that
** carry a value store it in 'seminfo': numerals through 'read_numeral',
** strings and names as a TString in seminfo->ts. Single-char tokens are
** returned as their own character code; multi-char operators and
** reserved words are returned as TK_* codes; TK_EOS signals the end of
** the input.
*/
static int llex(LexState *ls, SemInfo *seminfo)
{
    luaZ_resetbuffer(ls->buff);
    for (;;) {
        switch (ls->current) {
            case '\n':
            case '\r': { /* line breaks */
                inclinenumber(ls);
                break;
            }
            case ' ':
            case '\f':
            case '\t':
            case '\v': { /* spaces */
                next(ls);
                break;
            }
            case '-': { /* '-' or '--' (comment) */
                next(ls);
                if (ls->current != '-')
                    return '-';
                /* else is a comment */
                next(ls);
                if (ls->current == '[') { /* long comment? */
                    size_t sep = skip_sep(ls);
                    luaZ_resetbuffer(ls->buff); /* 'skip_sep' may dirty the buffer */
                    if (sep >= 2) {
                        read_long_string(ls, NULL, sep); /* skip long comment */
                        luaZ_resetbuffer(ls->buff);      /* previous call may dirty the buff. */
                        break;
                    }
                    /* not a well-formed '[=*[': treat the rest as a short comment */
                }
                /* else short comment */
                while (!currIsNewline(ls) && ls->current != EOZ)
                    next(ls); /* skip until end of line (or end of file) */
                break;
            }
            case '[': { /* long string or simply '[' */
                size_t sep = skip_sep(ls);
                if (sep >= 2) {
                    read_long_string(ls, seminfo, sep);
                    return TK_STRING;
                } else if (sep == 0) /* '[=...' missing second bracket? */
                    lexerror(ls, "invalid long string delimiter", TK_STRING);
                /* 'lexerror' does not return, so here 'sep' is 1: a single bracket */
                return '[';
            }
            case '=': {
                next(ls);
                if (check_next1(ls, '='))
                    return TK_EQ; /* '==' */
                else
                    return '=';
            }
            case '<': {
                next(ls);
                if (check_next1(ls, '='))
                    return TK_LE; /* '<=' */
                else if (check_next1(ls, '<'))
                    return TK_SHL; /* '<<' */
                else
                    return '<';
            }
            case '>': {
                next(ls);
                if (check_next1(ls, '='))
                    return TK_GE; /* '>=' */
                else if (check_next1(ls, '>'))
                    return TK_SHR; /* '>>' */
                else
                    return '>';
            }
            case '/': {
                next(ls);
                if (check_next1(ls, '/'))
                    return TK_IDIV; /* '//' */
                else
                    return '/';
            }
            case '~': {
                next(ls);
                if (check_next1(ls, '='))
                    return TK_NE; /* '~=' */
                else
                    return '~';
            }
            case ':': {
                next(ls);
                if (check_next1(ls, ':'))
                    return TK_DBCOLON; /* '::' */
                else
                    return ':';
            }
            case '"':
            case '\'': { /* short literal strings */
                read_string(ls, ls->current, seminfo);
                return TK_STRING;
            }
            case '.': { /* '.', '..', '...', or number */
                /* the dot is saved because it may be the start of a numeral */
                save_and_next(ls);
                if (check_next1(ls, '.')) {
                    if (check_next1(ls, '.'))
                        return TK_DOTS; /* '...' */
                    else
                        return TK_CONCAT; /* '..' */
                } else if (!lisdigit(ls->current))
                    return '.';
                else
                    return read_numeral(ls, seminfo);
            }
            case '0':
            case '1':
            case '2':
            case '3':
            case '4':
            case '5':
            case '6':
            case '7':
            case '8':
            case '9': {
                return read_numeral(ls, seminfo);
            }
            case EOZ: {
                return TK_EOS;
            }
            default: {
                if (lislalpha(ls->current)) { /* identifier or reserved word? */
                    TString *ts;
                    do {
                        save_and_next(ls);
                    } while (lislalnum(ls->current));
                    ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
                                        luaZ_bufflen(ls->buff));
                    seminfo->ts = ts;
                    if (isreserved(ts)) /* reserved word? */
                        /* 'extra' holds the 1-based index set by 'luaX_init' */
                        return ts->extra - 1 + FIRST_RESERVED;
                    else {
                        return TK_NAME;
                    }
                } else { /* single-char tokens ('+', '*', '%', '{', '}', ...) */
                    int c = ls->current;
                    next(ls);
                    return c;
                }
            }
        }
    }
}

/*
** Advances to the next token, making it the current token 'ls->t'.
** The line of the token being left is recorded in 'ls->lastline'. A
** pending look-ahead token, if any, is consumed instead of scanning.
*/
void luaX_next(LexState *ls)
{
    ls->lastline = ls->linenumber;
    if (ls->lookahead.token == TK_EOS) { /* no look-ahead token pending? */
        ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */
        return;
    }
    ls->t = ls->lookahead;        /* use the look-ahead token */
    ls->lookahead.token = TK_EOS; /* and discharge it */
}

/*
** Scans one token ahead without changing the current token, stores it
** as the look-ahead token, and returns its code. There must be no
** look-ahead token pending when this is called.
*/
int luaX_lookahead(LexState *ls)
{
    int ahead;
    lua_assert(ls->lookahead.token == TK_EOS);
    ahead = llex(ls, &ls->lookahead.seminfo);
    ls->lookahead.token = ahead;
    return ahead;
}
